#1. Load Required Packages
required_packages <- c("readxl", "nortest", "tidyverse", "dplyr")
# Walk through the package list: install whatever is not available yet, then
# attach the package so its functions can be called without a prefix.
for (pkg in required_packages) {
  is_installed <- requireNamespace(pkg, quietly = TRUE)
  if (!is_installed) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
#Description: This section ensures the required packages (readxl, nortest, tidyverse, dplyr) are installed and loaded.

#2. Import Data Frame
# Import the cohort sheet unless a data frame named `df` is already available.
# A bare exists("df") cannot be used for this check: it also finds the
# function stats::df() and is therefore always TRUE. Restricting the lookup to
# mode "list" skips functions, and is.data.frame() confirms what was found
# (get0() returns NULL when nothing matches).
if (!is.data.frame(get0("df", mode = "list"))) {
  file_path <- "~/COHORT_Prediction_of_CIP.xlsx"
  # Cells containing "#N/A" are read as missing values.
  df <- read_excel(file_path, sheet = "cohort", na = "#N/A")
}
#Description: This section imports data from an Excel file if the data frame df does not already exist.


#3. Filter Data
# Columns needed for the SUV/SUL analysis.
analysis_columns <- c(
  "r_upper_suv_mean", "r_upper_suv_min", "r_upper_suv_max", "r_upper_suv_sd",
  "r_lower_suv_mean", "r_lower_suv_min", "r_lower_suv_max", "r_lower_suv_sd",
  "l_upper_suv_mean", "l_upper_suv_min", "l_upper_suv_max", "l_upper_suv_sd",
  "l_lower_suv_mean", "l_lower_suv_min", "l_lower_suv_max", "l_lower_suv_sd",
  "pneumonitis", "emphysema", "copd", "pet_device", "pet_distance_ici",
  "sex_male", "bmi", "weight", "size", "pet_impossible_spheres_placement",
  "side_r0_l1"
)
df_filtered <- df[, analysis_columns]

# Coerce every column to numeric BEFORE filtering. If a column was imported as
# text, comparing it with a number would otherwise be a string comparison
# (e.g. "50" <= 365 is FALSE), silently dropping or keeping the wrong rows.
df_filtered[] <- lapply(df_filtered, as.numeric)

# Keep rows whose pet_distance_ici lies in [-1, 365]. Rows with a missing
# distance are dropped (this matches the net effect of the previous two-step
# subset(), whose second step discarded NA rows).
distance <- df_filtered$pet_distance_ici
in_window <- !is.na(distance) & distance >= -1 & distance <= 365
df_filtered_distance <- df_filtered[in_window, ]

# Keep only rows where pet_impossible_spheres_placement is 0; rows with a
# missing flag are dropped as well.
placement_flag <- df_filtered_distance$pet_impossible_spheres_placement
df_filtered <- df_filtered_distance[!is.na(placement_flag) & placement_flag == 0, ]
#Description: This section selects the analysis columns, keeps only records where the distance between PET/CT and immunotherapy is between -1 and 365 days and pet_impossible_spheres_placement is 0, and converts all columns to numeric.


#4. Add Columns for Calculations
# Names of the derived metric columns. They are created here as all-NA
# placeholders and filled in later in the script.
cols <- c(
  # no_tumor_* SUV statistics
  "no_tumor_suv_mean", "no_tumor_suv_min", "no_tumor_suv_max",
  "no_tumor_suv_sd",
  # *_95 SUV values
  "suv_95", "no_tumor_suv_95", "upper_suv_95", "lower_suv_95",
  # SUL values derived from the maximum SUVs
  "sul_lung_max", "upper_sul_max", "lower_sul_max", "no_tumor_sul_max",
  # SUL values derived from the mean SUVs
  "sul_lung_mean", "upper_sul_mean", "lower_sul_mean", "no_tumor_sul_mean",
  # lean body mass
  "LBM"
)
df_filtered[, cols] <- NA
# Description: This section adds new columns to the filtered data frame for various calculated metrics.


#5. Define Functions for Calculations
# Calculate lean body mass (LBM) from BMI, body weight and sex.
#
# Arguments:
#   bmi     body mass index
#   size    body height; kept in the signature for backward compatibility but
#           not used, because height enters the formula only through `bmi`
#   weight  body weight
#   sex     1 = male, 0 = female
#
# Returns the LBM in the same unit as `weight`. The result is NA when `sex` is
# missing or is neither 0 nor 1 (previously a missing `sex` stopped the script
# with an error, and any other value failed with "object 'lbm' not found").
# The function is vectorised: arguments may be equal-length vectors.
calculate_LBM <- function(bmi, size, weight, sex) {
  # Sex-specific denominator of the formula; NA for unknown sex.
  denominator <- ifelse(
    sex == 1,
    (6.68 * 10^3) + (216 * bmi),
    ifelse(sex == 0, (8.78 * 10^3) + (244 * bmi), NA_real_)
  )
  (9.27 * 10^3 * weight) / denominator
}

# Convert a single SUV into an SUL by rescaling it with lbm / weight.
#
# Arguments:
#   suv     one SUV value (mean or max)
#   lbm     lean body mass
#   weight  body weight
#
# Returns NA when `suv` is missing, otherwise (suv / weight) * lbm.
calculate_SUL <- function(suv, lbm, weight) {
  if (is.na(suv)) {
    return(NA)
  }
  (suv / weight) * lbm
}
# Description: This section defines functions to calculate lean body mass (LBM) and the SUL, i.e. the standardized uptake value (SUV) rescaled by lean body mass instead of body weight.


#6. Calculate SUV and Related Metrics
# Fill in missing SUV measurements and derive the no_tumor_* statistics.
suv_stats <- c("mean", "min", "max", "sd")

# Step 1: within each lung region (upper / lower), if one side has no mean SUV
# but the other side does, copy all four statistics from the measured side.
# Both masks are computed from the values before any copying; they can never
# be TRUE for the same row.
for (region in c("upper", "lower")) {
  left_mean <- df_filtered[[paste0("l_", region, "_suv_mean")]]
  right_mean <- df_filtered[[paste0("r_", region, "_suv_mean")]]
  left_missing <- is.na(left_mean) & !is.na(right_mean)
  right_missing <- is.na(right_mean) & !is.na(left_mean)

  for (stat in suv_stats) {
    left_col <- paste0("l_", region, "_suv_", stat)
    right_col <- paste0("r_", region, "_suv_", stat)
    left_values <- df_filtered[[left_col]]
    right_values <- df_filtered[[right_col]]

    filled_left <- left_values
    filled_left[left_missing] <- right_values[left_missing]
    filled_right <- right_values
    filled_right[right_missing] <- left_values[right_missing]

    df_filtered[[left_col]] <- filled_left
    df_filtered[[right_col]] <- filled_right
  }
}

# Step 2: no_tumor_* statistics are the average of the lower and upper region
# of one lung: the left lung when side_r0_l1 == 1, the right lung when
# side_r0_l1 == 0. Rows with a missing side, or without a lower-region mean on
# the selected side, keep their NA placeholder.
#
# The values are averaged with rowMeans() on purpose: mean(a, b, na.rm = TRUE)
# would treat `b` as the `trim` argument and simply return `a`.
side <- df_filtered$side_r0_l1
use_left <- !is.na(side) & side == 1 & !is.na(df_filtered$l_lower_suv_mean)
use_right <- !is.na(side) & side == 0 & !is.na(df_filtered$r_lower_suv_mean)

for (stat in suv_stats) {
  left_avg <- rowMeans(
    cbind(
      df_filtered[[paste0("l_lower_suv_", stat)]],
      df_filtered[[paste0("l_upper_suv_", stat)]]
    ),
    na.rm = TRUE
  )
  right_avg <- rowMeans(
    cbind(
      df_filtered[[paste0("r_lower_suv_", stat)]],
      df_filtered[[paste0("r_upper_suv_", stat)]]
    ),
    na.rm = TRUE
  )

  target_col <- paste0("no_tumor_suv_", stat)
  target_values <- df_filtered[[target_col]]
  target_values[use_left] <- left_avg[use_left]
  target_values[use_right] <- right_avg[use_right]
  df_filtered[[target_col]] <- target_values
}

# Average right and left side into one value per lung region, then average the
# two regions into one value for the whole lung. Missing values are ignored in
# each average. Columns are created in the order mean, max, min, sd.
stat_names <- c("mean", "max", "min", "sd")

# upper_suv_* and lower_suv_*: mean of the right and left measurement.
for (region in c("upper", "lower")) {
  for (stat in stat_names) {
    side_pair <- paste0(c("r_", "l_"), region, "_suv_", stat)
    df_filtered[[paste0(region, "_suv_", stat)]] <-
      rowMeans(df_filtered[, side_pair], na.rm = TRUE)
  }
}

# suv_lung_*: mean of the upper and lower region values computed above.
for (stat in stat_names) {
  region_pair <- paste0(c("upper", "lower"), "_suv_", stat)
  df_filtered[[paste0("suv_lung_", stat)]] <-
    rowMeans(df_filtered[, region_pair], na.rm = TRUE)
}

# *_95 values: mean + qnorm(0.95) * SD, i.e. the 95th percentile of a normal
# distribution with that mean and SD. NA inputs propagate to NA results.
# upper_suv_95 and lower_suv_95 are now computed for every row; previously
# they were only filled when no_tumor_suv_mean was available, although they do
# not depend on it.
z_95 <- qnorm(0.95)
df_filtered$suv_95 <-
  df_filtered$suv_lung_mean + (z_95 * df_filtered$suv_lung_sd)
df_filtered$no_tumor_suv_95 <-
  df_filtered$no_tumor_suv_mean + (z_95 * df_filtered$no_tumor_suv_sd)
df_filtered$upper_suv_95 <-
  df_filtered$upper_suv_mean + (z_95 * df_filtered$upper_suv_sd)
df_filtered$lower_suv_95 <-
  df_filtered$lower_suv_mean + (z_95 * df_filtered$lower_suv_sd)

# SUL target column -> SUV source column.
sul_sources <- c(
  sul_lung_mean = "suv_lung_mean",
  sul_lung_max = "suv_lung_max",
  upper_sul_mean = "upper_suv_mean",
  upper_sul_max = "upper_suv_max",
  lower_sul_mean = "lower_suv_mean",
  lower_sul_max = "lower_suv_max",
  no_tumor_sul_mean = "no_tumor_suv_mean",
  no_tumor_sul_max = "no_tumor_suv_max"
)

# LBM and SUL are computed row by row because calculate_LBM() and
# calculate_SUL() are called with scalar arguments.
for (i in seq_len(nrow(df_filtered))) {
  weight_i <- df_filtered$weight[i]
  lbm_i <- calculate_LBM(
    df_filtered$bmi[i], df_filtered$size[i], weight_i, df_filtered$sex_male[i]
  )
  df_filtered$LBM[i] <- lbm_i

  for (sul_col in names(sul_sources)) {
    suv_i <- df_filtered[[sul_sources[[sul_col]]]][i]
    # The third argument of calculate_SUL() is the body weight; the body
    # height (`size`) was passed here before, which gave wrong SUL values.
    df_filtered[[sul_col]][i] <- calculate_SUL(suv_i, lbm_i, weight_i)
  }
}
# Description: This section fills in missing SUV values and calculates various metrics based on available data

#7. Create Subgroup and Text Information Output
# Subgroup: rows whose pet_device equals 6 (rows with a missing device are
# excluded).
device <- df_filtered$pet_device
df_filtered_subgroup <- df_filtered[!is.na(device) & device == 6, ]

# Count patients with (1) and without (0) pneumonitis in the subgroup and in
# the full filtered cohort, then report how the subgroup compares.
n_subgroup <- nrow(df_filtered_subgroup)
cases_subgroup <- sum(df_filtered_subgroup$pneumonitis == 1)
cases_all <- sum(df_filtered$pneumonitis == 1)
controls_subgroup <- sum(df_filtered_subgroup$pneumonitis == 0)
controls_all <- sum(df_filtered$pneumonitis == 0)
subgroup_message <- paste0(
  "for this calculation, the subgroup of ", n_subgroup,
  " patients scanned with the same PET device were used (",
  cases_subgroup, "/", cases_all, " of all patients with pneumonitis, ",
  controls_subgroup, "/", controls_all, " patients without pneumonitis)"
)
print(subgroup_message)
# Description: Creates a subgroup with only patients scanned using the same PET scanner and prints the distribution of pneumonitis status.

#8. Define Function to Run Numerical Tests
# Compare one numeric variable between the pneumonitis groups.
#
# Argument:
#   var  name (character string) of the column to test
#
# Reads the globals `df_filtered`, `df_filtered_subgroup` and
# `variables_subgroup`. Returns a one-row data frame with the variable name,
# the rounded mean and "(±SD)" of the first and second pneumonitis group, the
# rounded p value and a significance marker.
run_num_tests <- function(var) {
  # Variables listed in `variables_subgroup` are analysed on the subgroup data,
  # all others on the full filtered data.
  test_data <- if (var %in% variables_subgroup) {
    df_filtered_subgroup
  } else {
    df_filtered
  }

  # Lilliefors normality test on all values of the variable: a p value below
  # 0.01 selects the Wilcoxon test, otherwise the t-test is used.
  normality_p <- lillie.test(test_data[[var]])$p.value
  test <- if (normality_p < 0.01) {
    wilcox.test(test_data[[var]] ~ pneumonitis, data = test_data)
  } else {
    t.test(test_data[[var]] ~ pneumonitis, data = test_data)
  }

  # Mean, SD and group size per pneumonitis group.
  group_stats <- summarise(
    group_by(test_data, pneumonitis),
    mean = mean(.data[[var]], na.rm = TRUE),
    sd = sd(.data[[var]], na.rm = TRUE),
    n = n(),
    .groups = "keep"
  )

  # Significance marker for the p value.
  p <- test$p.value
  significance <- if (p < 0.001) {
    "***"
  } else if (p < 0.01) {
    "**"
  } else if (p < 0.05) {
    "*"
  } else if (p < 0.1) {
    "."
  } else {
    ""
  }

  data.frame(
    variable = var,
    noPneumonitis = round(group_stats$mean[1], 2),
    percent0 = paste0("(±", round(group_stats$sd[1], 2), ")"),
    Pneumonitis = round(group_stats$mean[2], 2),
    percent1 = paste0("(±", round(group_stats$sd[2], 2), ")"),
    p_value = round(p, 3),
    significance = significance
  )
}
#Description: This function runs numerical tests (t-test or Wilcoxon test) depending on the normality of the data, then calculates summary statistics and returns the results.


#9. Define Function to Insert Row in Results
# Insert one extra row into the global `results` data frame.
#
# Arguments:
#   var  kind of row to insert:
#          "start"   column-caption row ("Variable", "Mean", "+/- SD", ...)
#          "count"   group sizes taken from `df_filtered$pneumonitis`
#          "count3"  group sizes taken from `df_filtered_subgroup$pneumonitis`
#          any other string is used as the label of an otherwise empty row
#   row  number of existing rows that stay above the new row (0 = insert at
#        the top, nrow(results) = append at the end); values outside that
#        range are clamped to it
#
# Side effect: replaces the global `results` via `<<-`, which is what the
# callers in this script rely on. Returns the updated data frame invisibly.
insert_row <- function(var, row) {
  label <- var
  noPneumonitis <- ""
  Percent0 <- ""
  Pneumonitis <- ""
  Percent1 <- ""
  p_value <- ""
  significance <- ""

  if (var == "start") {
    label <- "Variable"
    noPneumonitis <- "Mean"
    Percent0 <- "+/- SD"
    Pneumonitis <- "Mean"
    Percent1 <- "+/- SD"
    p_value <- "p value"
  } else if (var %in% c("count", "count3")) {
    # "count" describes the full filtered cohort, "count3" the subgroup.
    count_source <- if (var == "count") df_filtered else df_filtered_subgroup
    tbl <- table(count_source$pneumonitis)
    # "n = <count> (<share>%)" for the k-th pneumonitis level.
    format_count <- function(k) {
      paste0("n = ", tbl[k], " (", round(tbl[k] / sum(tbl) * 100, 1), "%)")
    }
    label <- ""
    noPneumonitis <- format_count(1)
    Pneumonitis <- format_count(2)
    p_value <- "p value"
  }

  new_row <- data.frame(
    Group = label, noPneumonitis, Percent0, Pneumonitis, Percent1,
    p_value, significance
  )

  # Split with seq_len() instead of `(row + 1):nrow(results)`: the colon form
  # counts backwards when row == nrow(results), which inserted an NA row and
  # duplicated the last row.
  n_existing <- nrow(results)
  row <- max(0, min(row, n_existing))
  rows_above <- results[seq_len(row), ]
  rows_below <- results[seq_len(n_existing - row) + row, ]
  results <<- rbind(rows_above, new_row, rows_below)
  invisible(results)
}
#Description: This function inserts a new row into the results data frame, based on the given variable and row position.


#10. Build and Save the Result Tables (Table 2, Table S2, Table S3)
# The three tables are produced by the same steps and differ only in which
# variables are analysed on the subgroup data, which count row is shown,
# whether the table is printed, and the output file. They are therefore
# generated by one loop over `table_specs` instead of three copies of the code.

# Variables to test, in table order: five families of four rows each.
variables <- c("suv_lung_mean", "upper_suv_mean", "lower_suv_mean", "no_tumor_suv_mean",
               "suv_lung_max", "upper_suv_max", "lower_suv_max", "no_tumor_suv_max",
               "suv_95", "upper_suv_95", "lower_suv_95", "no_tumor_suv_95",
               "sul_lung_mean", "upper_sul_mean", "lower_sul_mean", "no_tumor_sul_mean",
               "sul_lung_max", "upper_sul_max", "lower_sul_max", "no_tumor_sul_max")

# Display label for every variable name; within each family the rows are
# whole lung, upper lung, lower lung, TFL.
row_labels <- setNames(
  rep(c("whole lung", "upper lung", "lower lung", "TFL"),
      length.out = length(variables)),
  variables
)

# Header rows and the number of rows that stay above each of them at the
# moment of insertion (applied in this order, after the count row).
section_rows <- c(SUVMEAN = 1, SUVMAX = 6, SUV95 = 11, SULMEAN = 16, SULMAX = 21)

# One entry per output table:
#   file      CSV file to write
#   subgroup  variables that run_num_tests() analyses on the subgroup data
#   count_row insert_row() key of the group-size row ("count" or "count3")
#   show      print the finished table before saving it?
table_specs <- list(
  list(
    file = "Table 2.csv",
    subgroup = c("suv_lung_max", "upper_suv_max", "lower_suv_max", "no_tumor_suv_max",
                 "sul_lung_max", "upper_sul_max", "lower_sul_max", "no_tumor_sul_max"),
    count_row = "count",
    show = FALSE
  ),
  list(
    file = "Table S2.csv",
    subgroup = c(""),
    count_row = "count",
    show = TRUE
  ),
  list(
    file = "Table S3.csv",
    subgroup = variables,
    count_row = "count3",
    show = TRUE
  )
)

# Raw test results of all tables, stacked in table order.
results_summary <- NULL

for (spec in table_specs) {
  # run_num_tests() reads this global to choose the data set per variable.
  variables_subgroup <- spec$subgroup

  # Run the tests: one result row per variable.
  results <- data.frame(Group = character(),
                        noPneumonitis = integer(), Percent0 = numeric(),
                        Pneumonitis = integer(), Percent1 = numeric(),
                        p_value = numeric(), significance = character())
  for (i in seq_along(variables)) {
    results[i, ] <- run_num_tests(variables[i])
  }

  results_summary <- if (is.null(results_summary)) {
    results
  } else {
    rbind(results_summary, results)
  }

  # Insert the count row and the header rows; insert_row() updates the global
  # `results` in place.
  insert_row(spec$count_row, 0)
  for (section in names(section_rows)) {
    insert_row(section, section_rows[[section]])
  }

  names(results)[1:7] <- c("", "No Pneumonitis", "", "Pneumonitis", "", "", "")
  rownames(results) <- 1:nrow(results)

  # Replace variable names by their display labels wherever they occur.
  for (variable_name in names(row_labels)) {
    results[results == variable_name] <- row_labels[[variable_name]]
  }

  if (spec$show) {
    print(results)
  }

  # Save the table as CSV.
  write.csv(results, file = spec$file, row.names = FALSE)
  print(paste0("saved: ", spec$file))
}
#Description: Runs the statistical tests for every variable, groups the rows under header rows, replaces variable names by readable labels, and saves each table to a CSV file. results_summary collects the raw test rows of all three tables.

#17. Print Additional Notes
# Smallest and largest p value among the first 60 rows of results_summary,
# rounded to three decimals and embedded in the summary sentence.
summary_p_values <- as.numeric(results_summary$p_value[c(1:60)])
lowest_p <- round(min(summary_p_values), 3)
highest_p <- round(max(summary_p_values), 3)
print(paste(
  "Patients who developed cinrPneumonitis after the initiation of ICI therapy did not show increased pretreatment radioligand uptake of the whole lung, the upper lung, the lower lung or the tumor free lung (p =",
  lowest_p, "–",
  highest_p, ")."
))
#Description: Prints a summary of the results

